# datasets/pbmc3k.py
from __future__ import annotations

from typing import Optional, Dict, Any

import numpy as np

from .base import DatasetSpec
from .registry import register
from .transforms import default_preprocess

def _require_scanpy():
    try:
        import scanpy as sc  # noqa: F401
    except Exception as e:
        raise RuntimeError(
            "The PBMC3k loader uses 'scanpy' (and anndata). Install with:\n"
            "    pip install scanpy anndata\n"
        ) from e

@register("pbmc3k")
def load_pbmc3k(
    cache_dir: Optional[str] = None,
    *,
    preprocess: bool = True,
    pca_n: Optional[int] = 50,
    random_state: int = 0,
) -> DatasetSpec:
    """
    Fetch PBMC 3k via scanpy and return it as a DatasetSpec.

    The expression matrix is densified and cast to float32. This loader does
    not attach any annotations: ``labels`` and ``batch`` are always None.

    Args:
        cache_dir: Directory scanpy should download/cache the dataset in.
            When None, scanpy's current ``settings.datasetdir`` is used.
        preprocess: If True, run ``default_preprocess`` on the matrix and merge
            the info dict it returns into ``meta``.
        pca_n: Forwarded to ``default_preprocess`` (presumably the number of
            PCA components -- confirm against that helper).
        random_state: Forwarded to ``default_preprocess``.

    Returns:
        DatasetSpec named "pbmc3k". ``meta`` always holds "scanpy_dataset",
        "n_cells" and "n_genes" (shape of the matrix *before* preprocessing),
        plus whatever ``default_preprocess`` reports when ``preprocess`` is True.

    Raises:
        RuntimeError: if scanpy cannot be imported.
    """
    _require_scanpy()
    import scanpy as sc

    if cache_dir is None:
        ad = sc.datasets.pbmc3k()  # downloads and caches automatically
    else:
        # scanpy caches datasets under the global settings.datasetdir; point
        # it at the requested directory only for this fetch and restore it
        # afterwards so other scanpy users in the process are unaffected.
        previous_datasetdir = sc.settings.datasetdir
        sc.settings.datasetdir = cache_dir
        try:
            ad = sc.datasets.pbmc3k()
        finally:
            sc.settings.datasetdir = previous_datasetdir

    # ad.X may be a sparse matrix (has .toarray) or already array-like.
    X = ad.X.toarray() if hasattr(ad.X, "toarray") else np.asarray(ad.X)
    X = X.astype(np.float32, copy=False)

    # Recorded before preprocessing so the counts describe the raw matrix.
    meta: Dict[str, Any] = {"scanpy_dataset": "pbmc3k", "n_cells": X.shape[0], "n_genes": X.shape[1]}

    if preprocess:
        X, info = default_preprocess(X, pca_n=pca_n, random_state=random_state)
        meta.update(info)

    # No labels or batch annotations are produced here; any labelling has to
    # happen later in the pipeline.
    return DatasetSpec(name="pbmc3k", X=X, labels=None, batch=None, meta=meta)
